In [None]:
import pytz
import datetime
import marimo as mo

# Build the notebook header: a fixed submission deadline plus a
# "last updated" stamp taken from the current time in Asia/Kolkata.
timestamp_format = "%Y-%m-%d, %I:%M:%S %p %Z"

india_timezone = pytz.timezone("Asia/Kolkata")
now = datetime.datetime.now(tz=india_timezone)
curr = now.strftime(timestamp_format)

header_markdown = rf"""
# Week - 8

**Submission Date:** `2025-11-16, 23:59 IST`

**Last Updated:** `{curr}`
"""

# Last expression of the cell, so the rendered markdown is the cell output.
mo.md(header_markdown)

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

In [None]:
# Direct-download link (Google Drive) for the CSV that is loaded below.
DATASET = (
    "https://drive.google.com/uc?id=1BljShfWAQ9-0O6PAurjC-XIX4_EuLMpw&export=download"
)

In [None]:
# Read the raw dataset straight from the download URL into a DataFrame.
df = pd.read_csv(filepath_or_buffer=DATASET)

In [None]:
# Preview a few random rows. The seed is fixed so that a fresh
# "Restart & Run All" shows the same rows every time; a single unseeded
# row (the previous behaviour) changed on every execution.
df.sample(n=5, random_state=0)

In [None]:
# Separate the predictors from the label column.
target_column = "sentiment"
X = df.drop(columns=[target_column])
y = df[target_column]

## Question 1
Exploring the dataset
* Load the dataset into the Colab environment.
* Split the dataset into train set and test set with 20% dataset belonging to test set. Keep the random_state=0.
* Determine the number of unique words in the text column of train set. Consider all words, including stop words, and ignore case differences.

In [None]:
# Hold out 20% of the rows for testing; random_state=0 keeps the split
# reproducible across runs.
splits = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = splits

In [None]:
# Bag-of-words vocabulary over the training text only.
# The defaults are spelled out to mirror the question: case is ignored
# (lowercase=True) and stop words are kept (stop_words=None).
# NOTE(review): the default token pattern only keeps tokens of two or more
# word characters, so one-letter words are not counted — confirm this
# matches the intended definition of "word".
vectorizer = CountVectorizer(lowercase=True, stop_words=None)
vectorizer.fit_transform(raw_documents=X_train["text"])

In [None]:
# Number of distinct tokens learned from the training text: the fitted
# vocabulary holds exactly one entry per feature.
len(vectorizer.vocabulary_)

## Question 2

### Preprocessing

Apply preprocessing to the features of both the training and test datasets as follows:
- For the numerical feature (Land Area (Km²))
  * scale using StandardScaler.

- For ordinal features (Density_Level, Population_Group)
  * Apply OrdinalEncoder with the following category mapping: "Low" → 0, "Medium" → 1, "High" → 2.

- For nominal features (Age of User, Time of Tweet, Continent)
  * Use OneHotEncoder with `sparse_output=False` and `drop="first"`

- For text features (text)
  * Apply TfidfVectorizer with the following parameters:
    * `lowercase=True`
    * `stop_words="english"`
    * `max_features=5000`
    * `ngram_range=(1, 2)`
    * `token_pattern=r"(?u)\b\w\w+\b|[@#]\w+"` (to include hashtags and mentions)
    * `strip_accents="unicode"` (to normalize characters like "é")

Calculate the sum of all the values in the first five rows of the transformed test feature matrix (up to 2 digits after the decimal).

In [None]:
# Column groups, one per preprocessing strategy.
numeric_columns = ["Land Area (Km²)"]
ordinal_columns = ["Density_Level", "Population_Group"]
nominal_columns = ["Age of User", "Time of Tweet", "Continent"]
# A bare string (not a list) so the text vectorizer receives a 1-D column
# of documents rather than a 2-D frame.
text_column = "text"

# Explicit category order so that "Low" -> 0, "Medium" -> 1, "High" -> 2;
# the same ordering is applied to every ordinal column.
ordinal_levels = ["Low", "Medium", "High"]
ordinal_encoder = OrdinalEncoder(
    categories=[list(ordinal_levels) for _ in ordinal_columns]
)

one_hot_encoder = OneHotEncoder(sparse_output=False, drop="first")

# The token pattern keeps ordinary words of 2+ characters and additionally
# captures @mentions and #hashtags as single tokens.
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    max_features=5000,
    ngram_range=(1, 2),
    token_pattern=r"(?u)\b\w\w+\b|[@#]\w+",
    strip_accents="unicode",
)

# Output columns follow the order of the transformers listed here, so the
# scaled numeric feature is column 0; any column not named above is passed
# through unchanged and appended at the end.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_columns),
        ("ord", ordinal_encoder, ordinal_columns),
        ("nom", one_hot_encoder, nominal_columns),
        ("text", tfidf_vectorizer, text_column),
    ],
    remainder="passthrough",
)

In [None]:
# Learn every preprocessing parameter (scaling statistics, category levels,
# TF-IDF vocabulary) from the training split only, then apply the fitted
# transformer unchanged to the test split.
X_train_trans = preprocessor.fit_transform(X=X_train)
X_test_trans = preprocessor.transform(X=X_test)

In [None]:
# Sum of every value in the first five transformed test rows, to 2 decimals.
# Slice the rows *before* summing: this avoids materialising the whole test
# matrix as a dense array just to read five rows, and it works whether the
# transformer returned a sparse matrix or a dense ndarray (the previous
# `.toarray()` call fails on a dense result).
first_five_rows = X_test_trans[:5]
round(first_five_rows.sum(), 2)

## Question 3

### Model Building
Train a MultinomialNB model on the preprocessed training dataset, excluding the Land Area (Km²) column, as MultinomialNB does not support negative input values. Then, evaluate the model by calculating the log_loss on the test dataset, also excluding the Land Area (Km²) feature.
> **Note:** The Land Area (Km²) feature is excluded only for this specific question.

In [None]:
# MultinomialNB rejects negative feature values, so drop column 0 of the
# transformed matrices: that is the standardized "Land Area (Km²)" emitted
# by the first ("num") transformer of the preprocessor.
X_train_nb = X_train_trans[:, 1:]
X_test_nb = X_test_trans[:, 1:]

mnb = MultinomialNB()
mnb.fit(X_train_nb, y_train)

# Pass the fitted class order explicitly: predict_proba columns follow
# mnb.classes_, and without `labels` log_loss infers the label set from
# y_test alone, which breaks if the test split lacks one of the classes.
log_loss(y_test, mnb.predict_proba(X_test_nb), labels=mnb.classes_)

## Question 4

### Error Analysis

Train a RandomForestClassifier with random_state=42 on the preprocessed training dataset. This time, include all features, including Land Area (Km²), in both the training and test datasets.

After training, evaluate the model on the preprocessed test dataset. Which class did the model find most confusing (as per preprocessed test dataset)?

In [None]:
# Random forest on the full transformed feature set (numeric column
# included); random_state=42 makes the fitted forest reproducible.
# fit() returns the estimator itself, so it can be chained and displayed.
rfc = RandomForestClassifier(random_state=42).fit(X_train_trans, y_train)
rfc

In [None]:
# Mean accuracy of the random forest on the held-out test split.
rfc.score(X=X_test_trans, y=y_test)

In [None]:
# Class labels in the order the fitted forest uses them; this order is
# needed below to map confusion-matrix positions back to class names.
rfc.classes_

In [None]:
# Confusion matrix on the test split (rows = true class, columns = predicted).
# `labels=rfc.classes_` pins the row/column order to the classifier's own
# class order. Without it the order is derived from the labels that happen
# to appear in y_test / the predictions, so indexing rfc.classes_ by a
# matrix position could point at the wrong class.
conf_matrix = confusion_matrix(
    y_test, rfc.predict(X_test_trans), labels=rfc.classes_
)
conf_matrix

In [None]:
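# Illustration with a 2x2 matrix (rows are the true classes):
#   conf_matrix.sum(axis=1)  -> [1348 + 184, 267 + 1474]   samples per true class
#   conf_matrix.diagonal()   -> [1348, 1474]               correctly classified per class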

In [None]:
# Zero out the correct predictions, then total what is left in each row:
# the number of test samples of each true class that were predicted wrongly.
off_diagonal = conf_matrix.copy()
np.fill_diagonal(off_diagonal, 0)
misclassifications = off_diagonal.sum(axis=1)
misclassifications

In [None]:
# Class with the largest number of misclassified test samples.
worst_index = np.argmax(misclassifications)
rfc.classes_[worst_index]

## Question 5
### Feature selection

Use RFECV with

- estimator=LogisticRegression(random_state=42, max_iter=1000)
- step=100
- n_jobs = -1

How many features got selected?

In [None]:
# Recursive feature elimination with cross-validation: drop 100 features
# per iteration and score each subset with a logistic-regression model,
# using all available cores.
selection_estimator = LogisticRegression(random_state=42, max_iter=1000)
refcv = RFECV(estimator=selection_estimator, step=100, n_jobs=-1)

# fit() returns the selector itself, which is displayed as the cell output.
refcv.fit(X_train_trans, y_train)

In [None]:
# Number of features kept by the cross-validated elimination above.
refcv.n_features_